# Main analysis code for "Getting Out the (Newly-Enfranchised) Vote"
# see readme.txt for discussion of deidentified dataset used
# email Ariel (arwhi@mit.edu) with questions

rm(list=ls())
library(estimatr) 
library(modelsummary)
library(car)
library(xtable)
library(ggplot2)
library(dplyr)
library(gridExtra)
library(gt)

#load main analysis dataset (deidentified; produced by "record_linkage_final_forinspection.R"
maindata <- read.csv("NJISJ_fall2021exp_deid.csv")

maindata$HHclusternum <- as.numeric(as.factor(maindata$hh_id)) #format clusters for estimatr (for SE's)
length(unique(maindata$hh_id));length(unique(maindata$HHclusternum)) 

##### covariate balance (Table 3 in SI)
table(maindata$source_list) #P1/P2 are parole and P3 probation
maindata$probation <- ifelse(maindata$source_list=="P3", 1, 0); summary(maindata$probation)
maindata$white <- ifelse(maindata$race_model=="White", 1, ifelse(maindata$race_model=="Unknown",NA, 0)); summary(maindata$white)
maindata$black <- ifelse(maindata$race_model=="Black", 1, ifelse(maindata$race_model=="Unknown",NA, 0)); summary(maindata$black)
maindata$latinx <- ifelse(maindata$race_model=="Latinx", 1, ifelse(maindata$race_model=="Unknown",NA, 0)); summary(maindata$latinx)
maindata$aapi <- ifelse(maindata$race_model=="AAPI", 1, ifelse(maindata$race_model=="Unknown",NA, 0)); summary(maindata$aapi)

#make balance table with F-tests comparing arms to baseline (control)
age_test <- lm(ageyears ~ as.factor(assign_treat), data=maindata); summary(age_test)
woman_test <- lm(gender ~ as.factor(assign_treat) , data=maindata); summary(woman_test)
probation_test <- lm(probation ~ as.factor(assign_treat) , data=maindata); summary(probation_test)
white_test <- lm(white ~ as.factor(assign_treat) , data=maindata); summary(white_test)
black_test <- lm(black ~ as.factor(assign_treat) , data=maindata); summary(black_test)
latinx_test <- lm(latinx ~ as.factor(assign_treat) , data=maindata); summary(latinx_test)
ptreg_test <- lm(registeredpretreat ~ as.factor(assign_treat) , data=maindata); summary(ptreg_test) 

#build table 
model.summary = coef(summary(age_test))[, 1:2]
model.summary[,2] <- paste("(",sprintf("%.2f", round(model.summary[,2],2)), ")", sep="")
model.summary[1,2] <- ""
model.summary[,1] <- sprintf("%.2f",round(as.numeric(model.summary[,1]),2))
fstat <- summary(age_test)$fstatistic 
model.summary <-rbind(model.summary, c(sprintf("%.2f",pf(fstat[1], fstat[2], fstat[3], lower.tail=FALSE)),"")) 
allmodels <- list(model.summary) #this is one model (which list are you in)

models <- c( "woman_test", "probation_test","white_test", "latinx_test", "black_test", "ptreg_test") #now bring in all other covariate models
for (i in 1:length(models)){ #loop over them and format in the same way as above
	model.sum = coef(summary(get(models[i])))[, 1:2]
	model.sum[,2] <- paste("(",sprintf("%.2f",round(model.sum[,2],2)), ")", sep="")
	model.sum[1,2] <- ""
	model.sum[,1] <- sprintf("%.2f",round(as.numeric(model.sum[,1]),2))
	fstat <- summary(get(models[i]))$fstatistic 
	model.sum <-rbind(model.sum, c(sprintf("%.2f",pf(fstat[1], fstat[2], fstat[3], lower.tail=FALSE),2),"")) 
	allmodels[[i+1]] <- model.sum }
modeltab <- do.call("cbind", allmodels) #stick all together
modeltab <-cbind(modeltab, c(nrow(maindata[maindata$assign_treat== "Control",]),nrow(maindata[maindata$assign_treat== "Information-Only Mail",]),nrow(maindata[maindata$assign_treat== "Research-Informed Mail",]) , "" )) 

#output table
modeltable <- xtable(t(modeltab), 
                     label = "tab:sumstats", 
                     caption="Covariate balance across treatment arms",
                     align = "lcccc") 
colnames(modeltable) <- c("Control Mean", "Info-Only Mail", "Research-Informed Mail", "Joint F-test p-val")
rownames(modeltable) <- c("Age", " ", "Female", "  ", "Probation", "   ", "White", "    ",  "Latinx", "      ","Black", "       ","Already Registered", "        ", "Observations")

print(modeltable,
      caption.placement = "top",
      hline.after = c(-1,0, 14,15), 
      file="NJISJ_balancetestsbyarm_Ftests.tex")


### Analysis

## main regressions: separate arms
# set up with clustered SE's to account for HH-level (address) clustering)
# using stata-style bc if I use their default clustering approach it crashes as in JC's message here: https://github.com/DeclareDesign/estimatr/issues/318

registration_arms_covptreg <- lm_robust(registered ~ assign_treat +registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(registration_arms_covptreg)
turnout_arms_covptreg  <- lm_robust(voted_21 ~ assign_treat+registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(turnout_arms_covptreg)
updatedreg_arms_covptreg  <- lm_robust(updatedregistration ~ assign_treat+registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(updatedreg_arms_covptreg)

registration_arms_nocov <- lm_robust(registered ~ assign_treat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(registration_arms_nocov)
turnout_arms_nocov <- lm_robust(voted_21 ~ assign_treat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(turnout_arms_nocov)
updatedreg_arms_nocov <- lm_robust(updatedregistration ~ assign_treat, clusters=HHclusternum,  se_type = "stata", data=maindata)  
summary(updatedreg_arms_nocov)

## version comparing specs (Table 5 in SI)
mainmodelslong <-list(registration_arms_nocov, registration_arms_covptreg, updatedreg_arms_nocov, updatedreg_arms_covptreg, turnout_arms_nocov, turnout_arms_covptreg)
names(mainmodelslong) <- c("Baseline Reg","Baseline Reg", "(Re)Reg", "(Re)Reg", "Voted", "Voted") 
modelsummary(mainmodelslong,  
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_rename = c("assign_treatInformation-Only Mail" = "Info-Only Mail", "assign_treatResearch-Informed Mail" = "Research-Informed Mail ", "registeredpretreat" = "Already Registered" ),
	title="Registration and Voting by Treatment Arm \\label{nocovarcomptable}",
	output = "regression_regturnout_incpretreatreg.tex")

## main table with regressions and row about arm comparisons (Table 1 in main paper)
mainmodels <-list(registration_arms_covptreg,updatedreg_arms_covptreg, turnout_arms_covptreg) 
names(mainmodels) <- c("Baseline Registration","(Re)Registration", "Nov 2021 Turnout")

#test for difference of treatment arms
difftest_reg <- lh_robust(registered ~ assign_treat+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata, linear_hypothesis = "assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail")
difftest_voted <- lh_robust(voted_21 ~ assign_treat+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata, linear_hypothesis = "assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail") 
difftest_updatedreg <- lh_robust(updatedregistration ~ assign_treat+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata, linear_hypothesis = "assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail") 


LHrows <- modelsummary(list(difftest_reg$lh, difftest_updatedreg$lh, difftest_voted$lh),fmt=4,
          coef_rename = c("assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail" = "Difference: T2-T1"),
          gof_omit = ".*",stars = c('*' = .05),
          output = "data.frame")[, c(2, 4:6)]
attr(LHrows, "position") = c(5,6)
LHrows[2,1] <- "" 
controlmeans <- c("Control Mean", round(mean(maindata[maindata$assign_treat=="Control", "registered"]),4), round(mean(maindata[maindata$assign_treat=="Control", "updatedregistration"]),4), round(mean(maindata[maindata$assign_treat=="Control", "voted_21"]),4)) #add control-grp means
LHrows1 <- rbind(LHrows, controlmeans)

modelsummary(mainmodels,  
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_omit="(Intercept)|registeredpretreat", 
	coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)" ),add_rows=LHrows1,fmt=4,
	title="Registration and Voting by Treatment Arm\\label{maincoefftable}",
	notes="All models include pre-treatment registration status as a covariate",
	output = "main_regression_regturnout_incpretreatreg_stackedpanel.tex")
#commenting out this experimentation about wanting to add an hline above the obs count---come back and discuss with team
#tabout <- modelsummary(mainmodels,  
#	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
#	coef_omit="(Intercept)|registeredpretreat", 
#	coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail", "assign_treatResearch-Informed Mail" = "Research-Informed Mail " ),add_rows=LHrows1,
#	title="Registration and Voting by Treatment Arm\\label{maincoefftable}",
#	notes="All models include pre-treatment registration status as a covariate")
#	#output = "../figures/main_regression_regturnout_incpretreatreg_stackedpanel.tex")

#tabout %>%
#  row_spec(6, extra_css = "border-bottom: 3px solid") %>% 
#  save_kable("../figures/main_regression_regturnout_incpretreatreg_stackedpanel2.tex",float = FALSE) #still no.
##what if we instead grab the tex output from modelsummary
##and then just manually shove an hline in there while outputting via writelines?
##well, ck with team first before fussing more.

### Dropping in here Aug 2024 to see about reviewer suggestion of running as logit instead---will make a table and discuss pros/cons with team, then clean up code
library(miceadds)

registration_arms_covptreg_logit <- miceadds::glm.cluster(data=maindata, formula=registered ~ assign_treat +registeredpretreat,
                              cluster="HHclusternum", family="binomial")
summary(registration_arms_covptreg_logit)
turnout_arms_covptreg_logit <- miceadds::glm.cluster(data=maindata, formula=voted_21 ~ assign_treat +registeredpretreat,
                              cluster="HHclusternum", family="binomial")
summary(turnout_arms_covptreg_logit)
updatedreg_arms_covptreg_logit <- miceadds::glm.cluster(data=maindata, formula=updatedregistration ~ assign_treat +registeredpretreat,
                              cluster="HHclusternum", family="binomial")
summary(updatedreg_arms_covptreg_logit)

#make this into a table?

library(texreg)
texreg(l=list(registration_arms_covptreg_logit, updatedreg_arms_covptreg_logit, turnout_arms_covptreg_logit), stars = 0.05, 
	custom.model.names = c("Baseline Registration","(Re)Registration", "November 2021 Turnout"),
	custom.coef.names = c("Intercept", "Information-Only Mail (T1)", "Research-Informed Mail (T2)", "Already Registered"),
	caption = "Logit Versions of Main Models", file="logitversions_mainmodels.tex")


## pooling treatment arms (Table 4 in SI)
maindata$anytreat <- ifelse(maindata$assign_treat=="Control", 0, 1); table(maindata$anytreat) 

registration_arms_pooled_covpt <- lm_robust(registered ~ anytreat+registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(registration_arms_pooled_covpt) 
turnout_arms_pooled_covpt <- lm_robust(voted_21 ~ anytreat+registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata)
summary(turnout_arms_pooled_covpt)
updatedreg_arms_pooled_covpt <- lm_robust(updatedregistration ~ anytreat+registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata)
summary(updatedreg_arms_pooled_covpt)

pooledmodels <-list(registration_arms_pooled_covpt, updatedreg_arms_pooled_covpt, turnout_arms_pooled_covpt)
names(pooledmodels) <- c("Baseline Registration","(Re)Registration", "November 2021 Turnout")
modelsummary(pooledmodels,  
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_rename = c("anytreat" = "Any Treatment Arm", "registeredpretreat" = "Already Registered"),title="Pooling both treatment arms \\label{poolingarms}",
	output = "pooled_regression_regturnout_ptreg.tex")


### grouped coeff plot (Figure 2 in main paper)
#set up labels
regmodelname <- paste("Baseline Registration \n (Control Mean = ", round(coef(registration_arms_nocov)[1],2), ")", sep="") 
regmodelname
updregmodelname <- paste("(Re)Registration \n (Control Mean = ", round(coef(updatedreg_arms_nocov)[1],2), ")", sep="") 
updregmodelname
votemodelname <- paste("Voter Turnout \n (Control Mean = ", round(coef(turnout_arms_nocov)[1],2), ")", sep="") 
votemodelname

regcoeff <- registration_arms_covptreg %>% 
  tidy %>% 
  filter(term != "(Intercept)") %>% 
  mutate(model = regmodelname)

updregcoeff <- updatedreg_arms_covptreg %>% 
  tidy %>% 
  filter(term != "(Intercept)") %>% 
  mutate(model = updregmodelname)

votecoeff <- turnout_arms_covptreg %>% 
  tidy %>% 
  filter(term != "(Intercept)") %>%
  mutate(model = votemodelname)

allcoeff <- data.frame(rbind(regcoeff, updregcoeff, votecoeff))[c(1,2,4,5,7,8),]  
allcoeff[allcoeff$term == "assign_treatInformation-Only Mail", "term"] <- "Information-Only Mail (T1)" 
allcoeff[allcoeff$term == "assign_treatResearch-Informed Mail", "term"] <- "Research-Informed Mail (T2)" 

#plot
coeffplotmain <- allcoeff %>%
 ggplot(aes(x = model, y = estimate, color=term, shape=term)) +
  geom_hline(yintercept = 0, linetype = 2) + 
  geom_point(position = position_dodge2(width = 1/4), size=3) + 
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high, width=0.001), position=position_dodge(width=1/4)) + 
  theme_bw()+
  ylab("Treatment Effect") + ggtitle("Effects of Treatment Conditions on Registration and Voting")+
  theme(plot.title = element_text(hjust = 0.5),axis.title.y=element_blank(), text = element_text(size=14)) + 
  coord_flip()+  labs(color = "Treatment Condition", shape = "Treatment Condition") 
 

coeffplotmain
dev.off()

pdf("turnoutregistration_groupedcoeffplot.pdf", width=8, height=5) 
coeffplotmain
dev.off()

### Heterogeneity by race (pre-registered)

#collapse to white/non for simple look
maindata$racemodelnonwhite <- ifelse(maindata$race_model=="White",0,1); table(maindata$racemodelnonwhite)
maindata$racemodelnonwhite <- ifelse(is.na(maindata$white), NA, maindata$racemodelnonwhite)

registration_arms_byrace_ptr <- lm_robust(registered ~ assign_treat * racemodelnonwhite+ registeredpretreat , clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(registration_arms_byrace_ptr) 
turnout_arms_byrace_ptr <- lm_robust(voted_21 ~ assign_treat*racemodelnonwhite + registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(turnout_arms_byrace_ptr)

#registration table that splits out each group and then shows treatment interaction with "nonwhite" in the final column (Table 7 in SI)
registration_arms_nocov_white <- lm_robust(registered ~ assign_treat + registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$white==1,]) 
summary(registration_arms_nocov_white)
registration_arms_nocov_latinx <- lm_robust(registered ~ assign_treat + registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$latinx==1,]) 
summary(registration_arms_nocov_latinx)
registration_arms_nocov_black <- lm_robust(registered ~ assign_treat + registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$black==1,]) 
summary(registration_arms_nocov_black)
registration_arms_nocov_allnonwhite <- lm_robust(registered ~ assign_treat + registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$white==0,]) 
summary(registration_arms_nocov_allnonwhite)

modelsbyrace <-list(registration_arms_nocov_white, registration_arms_nocov_latinx, registration_arms_nocov_black, registration_arms_nocov_allnonwhite, registration_arms_byrace_ptr)
names(modelsbyrace) <- c("White", "Latinx", "Black", "All nonwhite", "All (Interacted)")
modelsummary(modelsbyrace,  
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)", "racemodelnonwhite"="Nonwhite", "assign_treatInformation-Only Mail:racemodelnonwhite" = "Information-Only Mail * Nonwhite", "assign_treatResearch-Informed Mail:racemodelnonwhite" = "Research-Informed Mail * Nonwhite", "registeredpretreat" = "Already Registered"),
	title="Registration Effects by Race\\label{regbyrace1}",
	output = "registration_byrace.tex") 

#same for turnout (Table 8 in SI)
turnout_arms_nocov_white <- lm_robust(voted_21 ~ assign_treat+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$white==1,]) 
summary(turnout_arms_nocov_white)
turnout_arms_nocov_latinx <- lm_robust(voted_21 ~ assign_treat+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$latinx==1,]) 
summary(turnout_arms_nocov_latinx)
turnout_arms_nocov_black <- lm_robust(voted_21 ~ assign_treat+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$black==1,]) 
summary(turnout_arms_nocov_black)
turnout_arms_nocov_allnonwhite <- lm_robust(voted_21 ~ assign_treat+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$white==0,]) 
summary(turnout_arms_nocov_allnonwhite)

votemodelsbyrace <-list(turnout_arms_nocov_white, turnout_arms_nocov_latinx, turnout_arms_nocov_black, turnout_arms_nocov_allnonwhite, turnout_arms_byrace_ptr)
names(votemodelsbyrace) <- c("White", "Latinx", "Black", "All nonwhite", "All (Interacted)")
modelsummary(votemodelsbyrace,  
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)", "racemodelnonwhite"="Nonwhite", "assign_treatInformation-Only Mail:racemodelnonwhite" = "Information-Only Mail * Nonwhite", "assign_treatResearch-Informed Mail:racemodelnonwhite" = "Research-Informed Mail * Nonwhite", "registeredpretreat" = "Already Registered"),
	title="Turnout Effects by Race (November 2021 Election)\\label{votebyrace1}",
	output = "turnout_byrace.tex") 

#look a little further into the race situation
table(maindata$race, maindata$probation) #there's missingness for parole but for probation we never had anything
#so how does modeled race do for people we know are black in the parole data?
paroleblack <- maindata[maindata$probation==0 & maindata$race=="Black",]
table(paroleblack$race_model) #not great 

#alternative measure: backstop model with admin data 
maindata$nonwhite_alt <- ifelse(maindata$racemodelnonwhite==1, 1,ifelse(maindata$race=="Black" | maindata$race=="Latinx" | maindata$race=="AAPI", 1, 0)); table(maindata$racemodelnonwhite)
table(maindata$racemodelnonwhite, maindata$nonwhite_alt) 

registration_arms_byrace_alt <- lm_robust(registered ~ assign_treat * nonwhite_alt + registeredpretreat , clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(registration_arms_byrace_alt) 
turnout_arms_byrace_alt <- lm_robust(voted_21 ~ assign_treat*nonwhite_alt + registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(turnout_arms_byrace_alt)

#make table (Table 9 in SI)
modelsbyrace_alt <-list(registration_arms_byrace_alt, turnout_arms_byrace_alt)
names(modelsbyrace_alt) <- c("Voter Registration", "November 2021 Turnout")
modelsummary(modelsbyrace_alt,  
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)", "nonwhite_alt"="nonwhite_alt", "assign_treatInformation-Only Mail:nonwhite_alt" = "Information-Only Mail * nonwhite_alt", "assign_treatResearch-Informed Mail:nonwhite_alt" = "Research-Informed Mail * nonwhite_alt", "registeredpretreat" = "Already Registered"),
	title="Registration and Turnout Effects by Race (alternative measure)\\label{effectsbyracealt}",
	output = "turnout_byrace_altmeasure.tex") 

#use alt measure but only with the list where we had any race data (Table 10 in SI)
maindata$nonwhite_alt2 <- ifelse(maindata$race=="White", 0,ifelse(maindata$race=="Unknown", NA, 1)); table(maindata$nonwhite_alt2)
table(maindata$racemodelnonwhite, maindata$nonwhite_alt2) 

registration_arms_byrace_alt2 <- lm_robust(registered ~ assign_treat * nonwhite_alt2 + registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(registration_arms_byrace_alt2) 
turnout_arms_byrace_alt2 <- lm_robust(voted_21 ~ assign_treat*nonwhite_alt2+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(turnout_arms_byrace_alt2)

modelsbyrace_alt2 <-list(registration_arms_byrace_alt2, turnout_arms_byrace_alt2)
names(modelsbyrace_alt2) <- c("Voter Registration", "November 2021 Turnout")
modelsummary(modelsbyrace_alt2,  
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)", "nonwhite_alt"="nonwhite_alt", "assign_treatInformation-Only Mail:nonwhite_alt" = "Information-Only Mail * nonwhite_alt", "assign_treatResearch-Informed Mail:nonwhite_alt" = "Research-Informed Mail * nonwhite_alt", "registeredpretreat" = "Already Registered"),
	title="Registration and Turnout Effects by Race (alternative measure, using parole data only)\\label{effectsbyracealtparole}",
	output = "turnout_byrace_altmeasure_parolerecordsonly.tex") 

## Heterogeneity by list source (Table 11 in SI)

registration_arms_bysource <- lm_robust(registered ~ assign_treat * probation + registeredpretreat , clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(registration_arms_bysource) 
turnout_arms_bysource <- lm_robust(voted_21 ~ assign_treat*probation+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(turnout_arms_bysource) 
updatedreg_arms_bysource <- lm_robust(updatedregistration ~ assign_treat*probation+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(updatedreg_arms_bysource) 

modelsbysource <-list(registration_arms_bysource,updatedreg_arms_bysource, turnout_arms_bysource)
names(modelsbysource) <- c("Voter Registration", "Updated Registration","November 2021 Turnout")
modelsummary(modelsbysource,  
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_rename = c("registeredpretreat" = "Already Registered","assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)", "probation"="Probation", "assign_treatInformation-Only Mail:probation" = "Information-Only Mail * Probation", "assign_treatResearch-Informed Mail:probation" = "Research-Informed Mail * Probation"),
	title="Registration and Turnout Effects by Program Contact\\label{effectsbysource}",
	output = "turnout_bylistsource.tex") 

## Exploratory look at pre-experimental registration status 
# table of those not already registered (previously Table 2 in main paper)
registration_arms_nocov_nopretreatreg <- lm_robust(registered ~ assign_treat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$registeredpretreat==0,]) 
summary(registration_arms_nocov_nopretreatreg)

turnout_arms_nocov_nopretreatreg  <- lm_robust(voted_21 ~ assign_treat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$registeredpretreat==0,]) 
summary(turnout_arms_nocov_nopretreatreg)

mainmodels <-list(registration_arms_nocov_nopretreatreg, turnout_arms_nocov_nopretreatreg)
names(mainmodels) <- c("Voter Registration", "November 2021 Turnout")
modelsummary(mainmodels,  
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)", "registeredpretreat" = "Already Registered"),
	title="Registration and Voting by Treatment Arm (Among those not already registered) \\label{maincoefftable_unreg}",
	output = "simplest_regression_regturnout_notalreadyregisteredsubset.tex")

#now I think I want to expand this table to also include the already-registered in additional columns?
registration_arms_nocov_pretreatreg <- lm_robust(updatedregistration ~ assign_treat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$registeredpretreat==1,]) #note I'm using rereg here and plain reg above, since they make sense for each subset
summary(registration_arms_nocov_pretreatreg)

turnout_arms_nocov_pretreatreg  <- lm_robust(voted_21 ~ assign_treat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$registeredpretreat==1,]) 
summary(turnout_arms_nocov_pretreatreg)

mainmodels <-list(registration_arms_nocov_nopretreatreg, turnout_arms_nocov_nopretreatreg,registration_arms_nocov_pretreatreg, turnout_arms_nocov_pretreatreg)
names(mainmodels) <- c("Voter Registration", "Turnout","(Re)Registration", "Turnout")
byreg <- modelsummary(mainmodels,output="gt",  
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)", "registeredpretreat" = "Already Registered"),
	title="Registration and Voting by Treatment Arm (By Previous Registration)) \\label{maincoefftable_byreg}")%>%
  tab_spanner(label = 'Not Registered', columns = 2:3) %>%
  tab_spanner(label = 'Previously Registered', columns = 4:5) 

gt::gtsave(byreg, filename ="simplest_regression_regturnout_subsetbyalreadyregistered.tex")

#but maybe also want to make it look like the main table, with T2-T1 test. 
#test for difference of treatment arms
difftest_reg_pr0 <- lh_robust(registered ~ assign_treat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$registeredpretreat==0,], linear_hypothesis = "assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail")
difftest_voted_pr0 <- lh_robust(voted_21 ~ assign_treat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$registeredpretreat==0,], linear_hypothesis = "assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail") 

difftest_rereg_pr1 <- lh_robust(updatedregistration ~ assign_treat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$registeredpretreat==1,], linear_hypothesis = "assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail")
difftest_voted_pr1 <- lh_robust(voted_21 ~ assign_treat, clusters=HHclusternum,  se_type = "stata", data=maindata[maindata$registeredpretreat==1,], linear_hypothesis = "assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail") 

LHrows <- modelsummary(list(difftest_reg_pr0$lh, difftest_voted_pr0$lh, difftest_rereg_pr1$lh, difftest_voted_pr1$lh),fmt=4,
          coef_rename = c("assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail" = "Difference: T2-T1"),
          gof_omit = ".*",stars = c('*' = .05),
          output = "data.frame")[, c(2, 4:7)]
attr(LHrows, "position") = c(5,6)
LHrows[2,1] <- "" 
controlmeans <- c("Control Mean", round(mean(maindata[maindata$assign_treat=="Control" & maindata$registeredpretreat==0, "registered"]),4), round(mean(maindata[maindata$assign_treat=="Control"&maindata$registeredpretreat==0, "voted_21"]),4), round(mean(maindata[maindata$assign_treat=="Control" &maindata$registeredpretreat==1, "updatedregistration"]),4),round(mean(maindata[maindata$assign_treat=="Control" &maindata$registeredpretreat==1, "voted_21"]),4)) #add control-grp means
subsets <- c("Subset", "Unregistered", "Unregistered", "Already Registered","Already Registered")
#LHrows1 <- rbind(LHrows, subsets,controlmeans)
LHrows1 <- rbind(LHrows,controlmeans)

modelsummary(mainmodels,  output =   "simplest_regression_regturnout_alreadyregistered_stackedpanel.tex",
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_omit="(Intercept)|registeredpretreat", fmt=4,
	coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)" ),add_rows=LHrows1,
	title="Registration and Voting by Treatment Arm (By Previous Registration)) \\label{maincoefftable_byreg}")


### also, make separate grouped-coeff plots for previously-unreg, previously-reg (analogous to Table 3 in paper) for slides
## may delete later if don't use in the actual paper
## start with unregistered
#set up labels
regmodelname <- paste("Registration \n (Control Mean = ", round(coef(registration_arms_nocov_nopretreatreg)[1],2), ")", sep="") 
regmodelname
votemodelname <- paste("Voter Turnout \n (Control Mean = ", round(coef(turnout_arms_nocov_nopretreatreg)[1],2), ")", sep="") 
votemodelname

regcoeff <- registration_arms_nocov_nopretreatreg %>% 
  tidy %>% 
  filter(term != "(Intercept)") %>% 
  mutate(model = regmodelname)

votecoeff <- turnout_arms_nocov_nopretreatreg %>% 
  tidy %>% 
  filter(term != "(Intercept)") %>%
  mutate(model = votemodelname)

allcoeff <- data.frame(rbind(regcoeff, votecoeff))#[c(1,2,4,5,7,8),]  
allcoeff[allcoeff$term == "assign_treatInformation-Only Mail", "term"] <- "Information-Only Mail (T1)" 
allcoeff[allcoeff$term == "assign_treatResearch-Informed Mail", "term"] <- "Research-Informed Mail (T2)" 

#plot
coeffplotmain_unreg <- allcoeff %>%
 ggplot(aes(x = model, y = estimate, color=term, shape=term)) +
  geom_hline(yintercept = 0, linetype = 2) + 
  geom_point(position = position_dodge2(width = 1/4), size=3) + 
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high, width=0.001), position=position_dodge(width=1/4)) + 
  theme_bw()+
  ylab("Treatment Effect") + ggtitle("Previously-Unregistered (n=15,760)")+
  theme(plot.title = element_text(hjust = 0.5),axis.title.y=element_blank(), text = element_text(size=14)) + 
  coord_flip()+  labs(color = "Treatment Condition", shape = "Treatment Condition") 
#dev.off()

##now same thing for registered
#set up labels
regmodelname <- paste("(Re)Registration \n (Control Mean = ", round(coef(registration_arms_nocov_pretreatreg)[1],2), ")", sep="") 
regmodelname
votemodelname <- paste("Voter Turnout \n (Control Mean = ", round(coef(turnout_arms_nocov_pretreatreg)[1],2), ")", sep="") 
votemodelname

regcoeff <- registration_arms_nocov_pretreatreg %>% 
  tidy %>% 
  filter(term != "(Intercept)") %>% 
  mutate(model = regmodelname)

votecoeff <- turnout_arms_nocov_pretreatreg %>% 
  tidy %>% 
  filter(term != "(Intercept)") %>%
  mutate(model = votemodelname)

allcoeff <- data.frame(rbind(regcoeff, votecoeff))#[c(1,2,4,5,7,8),]  
allcoeff[allcoeff$term == "assign_treatInformation-Only Mail", "term"] <- "Information-Only Mail (T1)" 
allcoeff[allcoeff$term == "assign_treatResearch-Informed Mail", "term"] <- "Research-Informed Mail (T2)" 

#plot
coeffplotmain_reg <- allcoeff %>%
 ggplot(aes(x = model, y = estimate, color=term, shape=term)) +
  geom_hline(yintercept = 0, linetype = 2) + 
  geom_point(position = position_dodge2(width = 1/4), size=3) + 
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high, width=0.001), position=position_dodge(width=1/4)) + 
  theme_bw()+
  ylab("Treatment Effect") + ggtitle("Previously-Registered (n=8,008)")+
  theme(plot.title = element_text(hjust = 0.5),axis.title.y=element_blank(), text = element_text(size=14)) + 
  coord_flip()+  labs(color = "Treatment Condition", shape = "Treatment Condition") 
#dev.off()


require(gridExtra)
pdf("turnoutregistration_groupedcoeffplot_exploratoryregstatus.pdf", width=15, height=7) 
grid.arrange(coeffplotmain_unreg, coeffplotmain_reg, ncol=2)
dev.off()






#################################################
## expanded table of outcomes & reg status (SI Table 6)
m1 <- lm_robust(registered ~ assign_treat*registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
m2 <- lm_robust(updatedregistration ~ assign_treat*registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
m3 <- lm_robust(voted_21 ~ assign_treat*registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(m1)
summary(m2)
summary(m3)

mainmodels <-list(m1, m2, m3)
names(mainmodels) <- c("Baseline Registration","(Re)Registration", "Turnout")
modelsummary(mainmodels,  
             stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
             coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", 
                             "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)", 
                             "registeredpretreat" = "Already Registered",
                             "assign_treatInformation-Only Mail:registeredpretreat" = "Information-Only x Already Registered",
                             "assign_treatResearch-Informed Mail:registeredpretreat" = "Research-Informed x Already Registered"),
             title="Registration and Voting by Treatment Arm and Prior Registration",
             output = "treatment_regstatus_moderation.tex")


## additional table of 2022 turnout?

m1 <- lm_robust(registered22 ~ assign_treat + registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
m2 <- lm_robust(updatedregistration23 ~ assign_treat +registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
m3 <- lm_robust(voted_22 ~ assign_treat + registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata) 
summary(m1)
summary(m2)
summary(m3)

mainmodels <-list(m1, m2, m3)
#mainmodels <-list(m1, m3)
names(mainmodels) <- c("Baseline Registration","(Re)Registration", "Turnout")
#names(mainmodels) <- c("Baseline Registration", "Turnout")
modelsummary(mainmodels,  
             stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
             coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", 
                             "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)", 
                             "registeredpretreat" = "Already Registered"),
             title="Long-Run Effects on 2022 Registration and Voting",
             output = "treatment_longruneffects2022.tex")

#but maybe also want to make it look like the main table, with T2-T1 test. 
#test for difference of treatment arms
difftest_reg <- lh_robust(registered22 ~ assign_treat+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata, linear_hypothesis = "assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail")
difftest_voted <- lh_robust(voted_22 ~ assign_treat+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata, linear_hypothesis = "assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail") 
difftest_updatedreg <- lh_robust(updatedregistration23 ~ assign_treat+ registeredpretreat, clusters=HHclusternum,  se_type = "stata", data=maindata, linear_hypothesis = "assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail") 

LHrows <- modelsummary(list(difftest_reg$lh, difftest_updatedreg$lh, difftest_voted$lh),fmt=4,
          coef_rename = c("assign_treatResearch-Informed Mail = assign_treatInformation-Only Mail" = "Difference: T2-T1"),
          gof_omit = ".*",stars = c('*' = .05),
          output = "data.frame")[, c(2, 4:6)]
attr(LHrows, "position") = c(5,6)
LHrows[2,1] <- "" 
controlmeans <- c("Control Mean", round(mean(maindata[maindata$assign_treat=="Control", "registered22"]),4), round(mean(maindata[maindata$assign_treat=="Control", "updatedregistration23"]),4), round(mean(maindata[maindata$assign_treat=="Control", "voted_22"]),4)) #add control-grp means
LHrows1 <- rbind(LHrows, controlmeans)

modelsummary(mainmodels,  
	stars = c('*' = .05), gof_omit = 'DF|Deviance|R2|AIC|BIC|Std.',
	coef_omit="(Intercept)|registeredpretreat", 
	coef_rename = c("assign_treatInformation-Only Mail" = "Information-Only Mail (T1)", "assign_treatResearch-Informed Mail" = "Research-Informed Mail (T2)" ),add_rows=LHrows1,fmt=4,
	title="Long-Run Effects on 2022 Registration and Voting\\label{longrun}",
	notes="All models include pre-treatment registration status as a covariate",
	output = "treatment_longruneffects2022_stackedpanel.tex")


#also, take R2 suggestion and try setting up as IV?
library(ivreg)


#do need to figure out clustered SE's if using but for now keep it simple
iv1 <- ivreg(registered22 ~ registered + registeredpretreat|assign_treat + registeredpretreat ,  data=maindata) 
summary(iv1)
iv2 <- ivreg(updatedregistration23 ~ registered + registeredpretreat|assign_treat + registeredpretreat ,  data=maindata) #do need to figure out
summary(iv2)
iv3 <- ivreg(voted_22 ~ registered + registeredpretreat|assign_treat + registeredpretreat ,  data=maindata) #do need to figure out
summary(iv3)

summary(iv1, vcov = sandwich::sandwich, df = Inf)
sqrt(diag(sandwich::sandwich(iv1)))

library(stargazer)
stargazer(iv1,iv2,iv3,star.cutoffs = c(0.05), title="2SLS approach to long-run effects on 2022 outcomes",
covariate.labels =c("Registration 2021","Already Registered"), dep.var.labels=c("Registration","(Re)Registration","Turnout 2022"),  se = list(sqrt(diag(sandwich::sandwich(iv1))),sqrt(diag(sandwich::sandwich(iv2))),sqrt(diag(sandwich::sandwich(iv3)))),
	out = "longrun_IVversion.tex")

#have questions about effect size/assumptions here
